# -*- coding: utf-8 -*-
# !/usr/bin/env python
# !/usr/bin/python3
# @Time    : 2021/4/12 20:33
# @Author  : buke-freedom
# @File    : 爬取周榜.py
"""
# @description：

"""
import requests
import re
import string
import json
from lxml import etree

if __name__ == "__main__":
    douban_url = "https://movie.douban.com/chart"
    # UA伪装：将对应的User_Agent封装到一个字典中
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
    }
    # 获取排行榜页面所有的数据
    page_text = requests.get(url=douban_url, headers=headers).text
    # 从排行榜页面提取一周口碑榜数据
    """
    <div class="name">
                <a onclick="moreurl(this, {from:'mv_week'})" href="https://movie.douban.com/subject/33432655/" class="">
                    困在时间里的父亲
                </a>
            </div>
    
    """
    ex = '<div class="name">.*?href="(.*?)" class=".*?</div>'
    movie_top_list = re.findall(ex, page_text, re.S)

    for href in movie_top_list:
        # 遍历所有的电影链接，获取每一部电影的数据
        movie = requests.get(url=href, headers=headers).text
        # 解析电影名字
        """
        < span
        property = "v:itemreviewed" > 困在时间里的父亲
        The
        Father < / span >
        
        <h1>
        <span property="v:itemreviewed">困在时间里的父亲 The Father</span>
            <span class="year">(2020)</span>
        </h1>
        """
        movie_ex = '<span property="v:itemreviewed">(.*?)</span>'
        movie_name = re.findall(movie_ex, movie, re.S)
        #str = '\u4e00-\u9fa5'
        #movie_zh_name = re.findall(str, movie_name, re.S)
        #movie_zh_name = re.match([\u4e00-\u9fa5],movie_name,flags=0)
        print(movie_name)

    # print(movie_top_list)
    # print("抓取结束！")
